packs <- c('stringi', 'stringr', 'tm', 'topicmodels', 'tidyverse')
lapply(packs, library, character.only = T)

# Load the raw tweet export and keep only its first eight columns.
jan <- read.csv(file = "jan25.csv", header = TRUE)[, 1:8]

# Pass the tweet text through iconv() with UTF-8 as both source and target
# encoding.
# NOTE(review): presumably this is here to turn entries that are not valid
# UTF-8 into NA instead of failing later -- confirm.
jan$text <- iconv(jan$text, "UTF-8", "UTF-8")



## LOCATION ##



# Build the location column from the raw `user` string in three steps:
#   1. take the first fragment matching  u'location': ...',
#   2. drop its first 15 characters and its last 2 characters, leaving
#      only the text between them,
#   3. turn \uXXXX-style escape sequences into the characters they encode.
# Rows with no matching fragment end up as NA.
jan$loc <- stri_unescape_unicode(
  stri_sub(
    stri_extract_first_regex(jan$user, "(u'location': .*?',)"),
    from = 16L,
    to = -3L
  )
)

# Regular-expression fragments (Latin and Arabic spellings) that mark a
# self-reported location as Egyptian. They are OR-ed into one pattern below.
egypt.locs <- c(
  "Egypt", "Cairo", "Shubra", "Mahalla", "Mansoura", "Tanta",
  "Qahira", "Misr\\b", "Masr", "Alex", "Suez",
  "مصر", "القاهرة", "الإسكندرية", "الجيزة", "طنطا", "شبرا",
  "المحلة الكبرى", "المنصورة", "السويس", "بورسعيد",
  "Said\\b", "tahrir", "caire", "\\bdonia", "eldonia",
  "dunia", "ام الدنيا"
)

# Keep the rows whose location matches any fragment, ignoring case.
# Rows with a missing location never match and are dropped.
eg.only <- jan[grep(paste0(egypt.locs, collapse = "|"),
                    jan$loc, ignore.case = TRUE), ]



## DENSITY ##



# creating new time variable in R's native time format
# The format string expects stamps such as "Tue Jan 25 14:03:11 +0000 2011",
# read as GMT. The result is stored as POSIXct: strptime() returns POSIXlt,
# a list-based class that is not suited to being a data frame column.
# NOTE(review): %a and %b are matched against the day/month names of the
# current locale, so this parse assumes an English LC_TIME -- confirm.
eg.only$hour <- as.POSIXct(
  strptime(eg.only$created_at,
           format = "%a %b %d %H:%M:%S +0000 %Y", tz = "GMT")
)

# Plot window in Cairo local time. "Africa/Cairo" is the Olson name of the
# zone; the lower-case "egypt" used previously is not a valid zone name on
# case-sensitive systems, where R then treats the times as UTC.
cairo.tz <- "Africa/Cairo"
start <- as.POSIXct("2011-01-24 13:00:00", tz = cairo.tz)
end <- as.POSIXct("2011-01-25 19:00:00", tz = cairo.tz)

# 16 evenly spaced ticks over the 30-hour window, i.e. one every two hours,
# labelled with the Cairo hour of day.
hour.breaks <- seq(start, end, length.out = 16)

# POSIXct counts seconds, so the density's count is tweets per second;
# multiplying by 60 expresses it as tweets per minute.
densplot <- ggplot(eg.only, aes(x = hour, y = ..count.. * 60)) +
  geom_density(adjust = 1/5) +
  scale_x_datetime(name = "Time",
                   breaks = hour.breaks,
                   labels = format(hour.breaks, "%H"),
                   limits = c(start, end)) +
  scale_y_continuous(name = "Tweets per minute")

# Add day labels just below the x axis and a dashed line at the midnight
# that separates Jan 24 from Jan 25.
densplot2 <- densplot + theme_bw() +
  annotate(geom = "text",
           x = as.POSIXct("2011-01-24 18:30:00", tz = cairo.tz),
           y = -1, label = "Jan 24") +
  annotate(geom = "text",
           x = as.POSIXct("2011-01-25 09:30:00", tz = cairo.tz),
           y = -1, label = "Jan 25") +
  geom_vline(xintercept = as.POSIXct("2011-01-25 00:00:00", tz = cairo.tz),
             color = "grey", linetype = "dashed")

densplot2



## EXAMPLES ##



## trajectory of a tweet
# Print every row of eg.only whose text contains the message below
# (the original tweet together with any copies of it).
eg.only[grep(
  "آلاف نشطاء يتجهونفي مسرة من دار القضاء العالي الي ميدان التحرير عبر شارع الجلاء #jan25",
  eg.only$text
), ]

## number of mentions of Tahrir

# Count Tahrir mentions among the tweets posted strictly between `from` and
# `to`.
#   tweets   data frame with a date-time column `hour` and a text column `text`
#   from, to window bounds as "YYYY-MM-DD HH:MM" strings, read as Cairo local
#            time ("Africa/Cairo"; the lower-case "egypt" used previously is
#            not a valid zone name on case-sensitive systems)
# Returns a named numeric vector:
#   latin   tweets matching "tahrir" (case-insensitive)
#   arabic  tweets matching the Arabic spelling
#   share   (latin + arabic) / number of tweets in the window
tahrir.mentions <- function(tweets, from, to) {
  window.start <- as.POSIXct(from, tz = "Africa/Cairo")
  window.end <- as.POSIXct(to, tz = "Africa/Cairo")

  # which() discards rows whose timestamp is NA; indexing with a logical
  # mask that contains NA would add all-NA rows and inflate the denominator.
  in.window <- which(tweets$hour > window.start & tweets$hour < window.end)
  texts <- tweets$text[in.window]

  latin <- sum(grepl("tahrir", texts, ignore.case = TRUE))
  arabic <- sum(grepl("تحرير", texts, ignore.case = TRUE))

  # NOTE(review): a tweet containing both spellings is counted twice in the
  # numerator; kept as in the original computation -- confirm this is intended.
  c(latin = latin, arabic = arabic,
    share = (latin + arabic) / length(texts))
}

# before 1 PM
tahrir.mentions(eg.only, "2011-01-25 00:00", "2011-01-25 13:00")

# after 1 PM
tahrir.mentions(eg.only, "2011-01-25 13:00", "2011-01-26 00:00")



## TEXT ANALYSIS ##


# subsetting the dataset to different time periods and fitting one topic
# model per period

# Fit a Gibbs-sampled LDA topic model to the tweets posted strictly between
# `from` and `to`.
#   tweets   data frame with a date-time column `hour` and a text column `text`
#   from, to period bounds as "YYYY-MM-DD HH:MM" strings, read as Cairo local
#            time ("Africa/Cairo"; the lower-case "egypt" used previously is
#            not a valid zone name on case-sensitive systems)
#   seed     seed handed to the sampler so the fit is reproducible
#   k        number of topics (5 for every period)
# Returns the fitted model as produced by LDA().
lda.for.period <- function(tweets, from, to, seed, k = 5) {
  period.start <- as.POSIXct(from, tz = "Africa/Cairo")
  period.end <- as.POSIXct(to, tz = "Africa/Cairo")

  # which() discards rows whose timestamp is NA; a logical mask containing
  # NA would add all-NA rows, i.e. documents with missing text.
  in.period <- which(tweets$hour > period.start & tweets$hour < period.end)

  # creating a corpus from the text
  corp <- VCorpus(VectorSource(tweets$text[in.period]))

  # cleaning up the corpus for text analysis
  # NOTE(review): stripWhitespace does not appear to be a documented control
  # option here and may be ignored; kept as in the original -- confirm.
  dtm <- DocumentTermMatrix(corp, control = list(removePunctuation = TRUE,
              tolower = TRUE, stopwords = TRUE, stemming = TRUE,
              stripWhitespace = TRUE))

  LDA(dtm, control = list(seed = seed), k = k, method = "Gibbs")
}

# estimating 5 topics for each period
lda.1 <- lda.for.period(eg.only, "2011-01-22 00:00", "2011-01-24 00:00",
                        seed = 1298)
lda.2 <- lda.for.period(eg.only, "2011-01-24 00:00", "2011-01-25 00:00",
                        seed = 1372)
lda.3 <- lda.for.period(eg.only, "2011-01-25 00:00", "2011-01-26 00:00",
                        seed = 2789)
lda.4 <- lda.for.period(eg.only, "2011-01-26 00:00", "2011-01-29 00:00",
                        seed = 1188)
lda.5 <- lda.for.period(eg.only, "2011-02-02 00:00", "2011-02-11 00:00",
                        seed = 529)
lda.6 <- lda.for.period(eg.only, "2011-02-11 00:00", "2011-02-12 00:00",
                        seed = 2169)


# manually coded each topic into one of six categories
# Rows are the categories, columns the six periods; each cell is the number
# of that period's topics assigned to the category.
dates <- c('Jan 22-23', 'Jan 24', 'Jan 25', 'Jan 26-28', 'Feb 2-10', 'Feb 11')
# check.names = FALSE keeps the period labels exactly as written; by default
# read.table() rewrites them into syntactic names such as "Jan.22.23".
dat <- read.table(text = "
                  3 2 1 0 4 4  
                  2 2 0 0 0 0
                  0 0 3 2 0 0
                  0 0 1 0 1 0
                  0 0 0 1 0 0
                  0 1 0 2 0 1
                  ", header = FALSE, check.names = FALSE,
                  row.names = c('Opinions & slogans', 'Coordination', 
                                'News & updates', 'Referrals', 'Documentation',
                                'Other'),
                  col.names = dates)


# Stacked bar chart of topic categories per period, drawn in three passes:
# a solid pass, then two hatched passes laid on top of it. The two angle
# vectors differ for some categories, so those end up cross-hatched.
angle1 <- rep(c(45, 45, 135), length.out = 6)
angle2 <- rep(c(45, 135, 135), length.out = 6)
# Shading-line density per category; 0 draws no shading lines at all.
density1 <- c(0, 8, 16, 24, 32, 0)
density2 <- c(0, 8, 16, 24, 32, 0)

topic.counts <- as.matrix(dat)
# Solid pass: background colour (0) for the first five categories,
# colour 1 for the last one.
solid.fill <- c(0, 0, 0, 0, 0, 1)
hatch.col <- "gray40"

barplot(topic.counts, ylim = c(0, 5.9), main = "", col = solid.fill)
barplot(topic.counts, add = TRUE, ylim = c(0, 5.9), main = "",
        angle = angle2, density = density2, col = hatch.col)
barplot(topic.counts, add = TRUE, ylim = c(0, 5.9), main = "",
        angle = angle1, density = density1, col = hatch.col)

# The legend is built with the same three passes. In legend() the colour of
# the boxes and of their shading lines comes from `fill` (`col` only affects
# points and lines), so the hatch colour is passed as `fill` to make the
# legend keys match the bars.
legend("top", row.names(dat), ncol = 6, fill = solid.fill,
       bty = "n", cex = 1, text.font = 1, merge = FALSE)
legend("top", row.names(dat), ncol = 6, fill = hatch.col,
       bty = "n", cex = 1, text.font = 1, merge = FALSE,
       angle = angle1, density = density1)
legend("top", row.names(dat), ncol = 6, fill = hatch.col,
       bty = "n", cex = 1, text.font = 1, merge = FALSE,
       angle = angle2, density = density2)
